Pandas profiling is a framework used in this project to facilitate the study of variables
%%capture
# The cell magic above captures this cell's output so the installer logs
# produced by the commands below are not shown in the notebook.
import sys
# Install/upgrade pandas-profiling (with its notebook extras) using the
# interpreter that runs this kernel, so the package lands in the same
# environment the notebook imports from.
!{sys.executable} -m pip install -U pandas-profiling[notebook]
# Enable the ipywidgets notebook extension.
# NOTE(review): presumably required for the widget rendering of the profile
# report further down — confirm against the pandas-profiling docs.
!jupyter nbextension enable --py widgetsnbextension
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
import plotly.express as px
import plotly.io as pio
# Resize plots: default figure size as [width, height] in inches.
# NOTE(review): this is a matplotlib setting; the charts in this file are
# built with plotly express and are presumably unaffected — confirm.
plt.rcParams['figure.figsize'] = [22, 20]
# Input data files live in the read-only "../input/" directory.
# Walk that directory tree and print the full path of every file found,
# so the available datasets are visible in the notebook output.
import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# Load the "Students Performance in Exams" CSV from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/students-performance-in-exams/StudentsPerformance.csv')
# Preview the first rows (a bare expression is displayed only when it is the
# last statement of a notebook cell).
df.head()
# Data type of every column.
df.dtypes
# Build the pandas-profiling report for the whole DataFrame and render it
# as interactive notebook widgets.
profile = ProfileReport(df, title='Scores profiling report', explorative=True)
profile.to_widgets()
# Count the null values in each column.
df.isnull().sum()
# Author's observation from the output above: no column has missing values.
# Distribution of each exam score: for every subject draw a box plot
# followed by a histogram, in the order math, reading, writing.
for score_column in ("math score", "reading score", "writing score"):
    fig = px.box(df, y=score_column)
    fig.show()
    fig = px.histogram(df, x=score_column)
    fig.show()
Using the df.describe() method and the boxplots plotted above, one can see that the score distributions are very similar across the three subjects.
# Correlation matrix of the numeric columns, shaded so that stronger
# correlations appear in darker blue.
# The numeric columns are selected explicitly because the DataFrame also holds
# text columns (gender, lunch, ...): recent pandas versions no longer drop
# those automatically, and DataFrame.corr() raises when it meets them.
df.select_dtypes(include="number").corr().style.background_gradient(cmap="Blues")
We can see that there is a good positive correlation among the variables.
# Relative frequency of each gender. The first positional parameter of
# value_counts() is `normalize`, so ask for proportions explicitly rather
# than passing a column name that merely happens to be truthy.
df2 = df['gender'].value_counts(normalize=True)
print(df2)

# Number of students per gender, stored in a dictionary keyed by gender.
gender_column = df['gender']
proportion = {
    'female': gender_column[gender_column == 'female'].count(),
    'male': gender_column[gender_column == 'male'].count(),
}
print(proportion)

# Data to plot: upper-cased gender names on the x axis, counts on the y axis.
labels = [gender.upper() for gender in proportion]
sizes = list(proportion.values())

# Plot
fig = px.bar(x=labels, y=sizes)
fig.show()
One can see that the two genders are almost evenly represented, close to a 50/50 split.
# Share of students per parental education level. `normalize=True` yields
# relative frequencies; it is spelled out because the first positional
# parameter of value_counts() is the normalize flag, not a column name.
df['parental level of education'].value_counts(normalize=True)
# Share of students per lunch type.
df['lunch'].value_counts(normalize=True)
One can see that most students opt for the standard lunch.
# Share of students per race/ethnicity group. `normalize=True` is spelled out
# because the first positional parameter of value_counts() is the normalize
# flag, not a column name.
df['race/ethnicity'].value_counts(normalize=True)

# Number of students in each group, stored in a dictionary whose keys are
# 'group_A' ... 'group_E' (the column itself uses 'group A' ... 'group E').
groups = ['A', 'B', 'C', 'D', 'E']
ethnicity_column = df['race/ethnicity']
race2 = {
    f'group_{letter}': ethnicity_column[ethnicity_column == f'group {letter}'].count()
    for letter in groups
}
print(race2)

# Data to plot: group names on the x axis, student counts on the y axis.
races = list(race2)
values = list(race2.values())

# Plot
fig = px.bar(x=races, y=values)
fig.show()